Extracting hourly temperature data from the NOAA ISD (Integrated Surface Database) weather data

The ish_parser Python module is from: https://github.com/haydenth/ish_parser


In [1]:
# boilerplate includes
import sys
import os

import numpy as np
import matplotlib as mpl
#mpl.use('nbagg')
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D

#import mpld3 # for outputting interactive html figures

import pandas as pd
import seaborn as sns

import ish_parser
import gzip
import ftplib
import io

from IPython.display import display, HTML
%matplotlib notebook
# Apply matplotlib's 'seaborn-notebook' style sheet to all figures.
# NOTE(review): newer matplotlib releases renamed the seaborn style sheets
# (e.g. 'seaborn-v0_8-notebook') — confirm against the installed version.
plt.style.use('seaborn-notebook')

# None lifts the column limit so wide DataFrames are displayed in full
pd.set_option('display.max_columns', None)

In [11]:
# PARAMETERS (a driving script may define these before running this notebook)

# Run stand-alone (not from a batch script), SUBNOTEBOOK_FLAG will not exist
# yet, so treat its absence as "not a sub-notebook".
if 'SUBNOTEBOOK_FLAG' not in globals():
    SUBNOTEBOOK_FLAG = False

# Stand-alone run: the parameter values come from this cell.
if not SUBNOTEBOOK_FLAG:

    # EDIT THE VALUES BELOW UNLESS THIS NOTEBOOK IS INVOKED VIA %run
    # FROM ANOTHER NOTEBOOK

    # which station to process
    STATION_CALLSIGN = 'KLAX'

    # where to get the data from
    FTPHOST = 'ftp.ncdc.noaa.gov'
    FETCH_STATIONS_LIST_FILE = True

    # where to put the data
    DATADIR = '../data/temperatures/ISD'
    OUTDIR = '../data/temperatures'

    # run-time behaviour switches
    USE_CACHED_STATION_H5_FILES = True
    SUPPRESS_FIGURE_DISPLAY = False

print("Fetching and parsing ",STATION_CALLSIGN)


Fetching and parsing  KLAX

Locate the station info...

We could either look the station up by hand, or else try to get all the data associated with a single station callsign. The latter seems like the cooler way to go, but we have to be careful that the stations really are the same and that their data are comparable for our purposes.

stations list: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt


In [12]:
if FETCH_STATIONS_LIST_FILE:
    # Fetch a fresh copy of the stations list.
    # The transfer goes into memory first: opening the destination file for
    # writing truncates it, so writing directly would destroy a previously
    # downloaded copy whenever the connection or transfer failed.
    stations_list_buffer = io.BytesIO()
    with ftplib.FTP(host=FTPHOST) as ftpconn:
        ftpconn.login()
        ftpconn.retrbinary('RETR '+'/pub/data/noaa/isd-history.txt',
                           stations_list_buffer.write)
        # no explicit close(): leaving the `with` block ends the session

    # Only touch the file on disk once the whole download has succeeded
    os.makedirs(DATADIR, exist_ok=True)
    with open(os.path.join(DATADIR,'isd-history.txt'),'wb') as fh:
        fh.write(stations_list_buffer.getvalue())

In [13]:
# function to parse stations list file
def read_isd_history_stations_list(filename, skiprows=22):
    """Load the fixed-width station table from an isd-history.txt file.

    The first `skiprows` lines of the file are skipped. Every field is read
    through the `str` converter, so identifiers and dates are not turned
    into numbers.

    Returns a pandas DataFrame with one row per line of the table and the
    columns USAF, WBAN, STATION NAME, CTRY, ST, CALL, LAT, LON, EVEV,
    BEGIN and END.
    """
    # (column name, field width, converter), in file order
    layout = [
        ('USAF', 6, str),
        ('WBAN', 5, str),
        ('STATION NAME', 28, str),
        ('CTRY', 4, str),
        ('ST', 2, str),
        ('CALL', 5, str),
        ('LAT', 7, str),
        ('LON', 8, str),
        ('EVEV', 7, str),
        ('BEGIN', 8, str),
        ('END', 8, str),
    ]
    column_names = [name for name, _, _ in layout]
    field_converters = {name: conv for name, _, conv in layout}

    # Each span is one character wider than its field so that it also
    # covers the separator column that follows the field.
    column_spans = []
    start = 0
    for _, width, _ in layout:
        stop = start + width + 1
        column_spans.append((start, stop))
        start = stop

    return pd.read_fwf(filename, skiprows=skiprows,
                       names=column_names,
                       colspecs=column_spans,
                       converters=field_converters)

In [14]:
# parse the stations list stored in the data directory
stations_list_path = os.path.join(DATADIR, 'isd-history.txt')
stationsdf = read_isd_history_stations_list(stations_list_path)

In [15]:
# keep only the rows whose call sign matches the requested station
is_requested_station = stationsdf['CALL'] == STATION_CALLSIGN
station_info = stationsdf[is_requested_station]
station_info


Out[15]:
USAF WBAN STATION NAME CTRY ST CALL LAT LON EVEV BEGIN END
18862 722950 23174 LOS ANGELES INTERNATIONAL AIR US CA KLAX +33.938 -118.389 +0029.6 19440101 20170819
29330 999999 23174 LOS ANGELES MUNICIPAL ARPT US CA KLAX +33.938 -118.389 +0099.4 19470101 19721231

In [16]:
# # maybe only use a subset of these entries
# station_info = station_info.iloc[2:3]
# station_info

Getting the actual data

Data files are at: ftp://ftp.ncdc.noaa.gov/pub/data/noaa/{YEAR}/{USAF}-{WBAN}-{YEAR}.gz


In [17]:
def download_ish_data(usaf_id, wban_id, years_to_get,
                      ftp_host=FTPHOST,
                      verbose=True):
    """Download one station's yearly ISD files and extract air temperature.

    For each year in `years_to_get`, the file
    /pub/data/noaa/{YEAR}/{USAF}-{WBAN}-{YEAR}.gz is fetched from `ftp_host`,
    decompressed in memory, and fed line by line to an ish_parser parser.
    Years whose file does not exist on the server are reported and skipped.

    Parameters
    ----------
    usaf_id, wban_id : str
        Station identifiers used to build the remote file name.
    years_to_get : iterable of int
        Years to download.
    ftp_host : str
        FTP server to connect to.
    verbose : bool
        If true, print each remote file name and the final record count.

    Returns
    -------
    pandas.DataFrame or None
        Frame indexed by 'datetime' with a single 'AT' column; rows without
        an 'AT' value are dropped and the index is sorted. None is returned
        when the parser produced no reports at all.

    Raises
    ------
    ftplib.error_perm
        For any permanent FTP error other than a 550 reply.
    """
    parser = ish_parser.ish_parser()
    with ftplib.FTP(host=ftp_host) as ftpconn:
        ftpconn.login()
        for year in years_to_get:
            ftp_file = "/pub/data/noaa/{YEAR}/{USAF}-{WBAN}-{YEAR}.gz".format(
                USAF=usaf_id, WBAN=wban_id, YEAR=year)
            if verbose:
                print(ftp_file)
            # read the whole file and save it to a BytesIO (stream)
            response = io.BytesIO()
            try:
                ftpconn.retrbinary('RETR '+ftp_file, response.write)
            except ftplib.error_perm as err:
                if str(err).startswith('550 '):
                    # 550: no file for this year. Report it and move on
                    # instead of handing an empty buffer to gzip.
                    print('ERROR:', err)
                    continue
                raise
            # decompress and parse each line
            response.seek(0) # jump back to the beginning of the stream
            with gzip.open(response, mode='rb') as gzstream:
                for line in gzstream:
                    parser.loads(line.decode('latin-1'))
    # get the list of all reports
    reports = parser.get_reports()
    if verbose:
        print(len(reports), "records")
    # just return None if no records were found
    if len(reports) <= 0:
        return None
    # convert to a pandas dataframe
    station_df = pd.DataFrame.from_records(
                ((r.datetime, r.air_temperature.get_numeric()) for r in reports),
                columns=['datetime','AT'],
                index='datetime')
    # convert the index to pandas datetime objects
    station_df.index = pd.to_datetime(station_df.index)
    # drop entries which don't have an AT value, and make sure it is sorted
    return station_df.dropna().sort_index()

In [18]:
%%time
# Build one combined air-temperature dataset from every station entry that
# shares the requested call sign.
df = None
for _,row in station_info.iterrows():
    usaf_id = row['USAF']
    wban_id = row['WBAN']
    # BEGIN/END are YYYYMMDD strings; cover every year in that span
    years_to_get = range(int(row['BEGIN'][0:4]), int(row['END'][0:4])+1)
    print('####', usaf_id, wban_id, years_to_get)

    station_h5file = os.path.join(DATADIR,
                "{USAF}-{WBAN}-AT.h5".format(USAF=usaf_id, WBAN=wban_id))

    station_df = None
    loaded_from_cache = False
    if USE_CACHED_STATION_H5_FILES and os.path.isfile(station_h5file):
        print("Using cached file: '{}'".format(station_h5file))
        station_df = pd.read_hdf(station_h5file, key='table')
        loaded_from_cache = True

    if station_df is None:
        station_df = download_ish_data(usaf_id, wban_id, years_to_get, ftp_host=FTPHOST)

    if station_df is None:
        print("WARNING: No data found for {} {} {}".format(usaf_id, wban_id, years_to_get))
        continue

    # Save this station's individual data (pointless to rewrite the cache
    # file with the very data that was just read from it)
    if not loaded_from_cache:
        print("Saving station data to: '{}'".format(station_h5file))
        station_df.to_hdf(station_h5file, key='table')

    # Combine into single dataset
    if df is None:
        df = station_df.copy(deep=True)
    else:
        # @TCC TODO: Maybe use some more clever logic than just "combine_first"
        # combine_first keeps the values already in df and only fills the
        # gaps from station_df, so earlier station entries take priority.
        df = df.combine_first(station_df)

# Fail with a clear message instead of an AttributeError on None below
if df is None:
    raise RuntimeError(
        "No temperature data found for station '{}'".format(STATION_CALLSIGN))

# ensure the final combined dataset is sorted
df = df.sort_index()

# save the combined dataframe
combined_AT_filename = "{}_AT.h5".format(STATION_CALLSIGN)
combined_AT_path = os.path.join(DATADIR, combined_AT_filename)
# report the full path, since the file is written under DATADIR
print("Saving combined data to: '{}'".format(combined_AT_path))
df.to_hdf(combined_AT_path, key='table')


#### 722950 23174 range(1944, 2018)
/pub/data/noaa/1944/722950-23174-1944.gz
/pub/data/noaa/1945/722950-23174-1945.gz
ERROR: 550 /pub/data/noaa/1945/722950-23174-1945.gz: No such file or directory
/pub/data/noaa/1946/722950-23174-1946.gz
ERROR: 550 /pub/data/noaa/1946/722950-23174-1946.gz: No such file or directory
/pub/data/noaa/1947/722950-23174-1947.gz
ERROR: 550 /pub/data/noaa/1947/722950-23174-1947.gz: No such file or directory
/pub/data/noaa/1948/722950-23174-1948.gz
ERROR: 550 /pub/data/noaa/1948/722950-23174-1948.gz: No such file or directory
/pub/data/noaa/1949/722950-23174-1949.gz
ERROR: 550 /pub/data/noaa/1949/722950-23174-1949.gz: No such file or directory
/pub/data/noaa/1950/722950-23174-1950.gz
ERROR: 550 /pub/data/noaa/1950/722950-23174-1950.gz: No such file or directory
/pub/data/noaa/1951/722950-23174-1951.gz
ERROR: 550 /pub/data/noaa/1951/722950-23174-1951.gz: No such file or directory
/pub/data/noaa/1952/722950-23174-1952.gz
ERROR: 550 /pub/data/noaa/1952/722950-23174-1952.gz: No such file or directory
/pub/data/noaa/1953/722950-23174-1953.gz
ERROR: 550 /pub/data/noaa/1953/722950-23174-1953.gz: No such file or directory
/pub/data/noaa/1954/722950-23174-1954.gz
ERROR: 550 /pub/data/noaa/1954/722950-23174-1954.gz: No such file or directory
/pub/data/noaa/1955/722950-23174-1955.gz
ERROR: 550 /pub/data/noaa/1955/722950-23174-1955.gz: No such file or directory
/pub/data/noaa/1956/722950-23174-1956.gz
ERROR: 550 /pub/data/noaa/1956/722950-23174-1956.gz: No such file or directory
/pub/data/noaa/1957/722950-23174-1957.gz
ERROR: 550 /pub/data/noaa/1957/722950-23174-1957.gz: No such file or directory
/pub/data/noaa/1958/722950-23174-1958.gz
ERROR: 550 /pub/data/noaa/1958/722950-23174-1958.gz: No such file or directory
/pub/data/noaa/1959/722950-23174-1959.gz
ERROR: 550 /pub/data/noaa/1959/722950-23174-1959.gz: No such file or directory
/pub/data/noaa/1960/722950-23174-1960.gz
ERROR: 550 /pub/data/noaa/1960/722950-23174-1960.gz: No such file or directory
/pub/data/noaa/1961/722950-23174-1961.gz
ERROR: 550 /pub/data/noaa/1961/722950-23174-1961.gz: No such file or directory
/pub/data/noaa/1962/722950-23174-1962.gz
ERROR: 550 /pub/data/noaa/1962/722950-23174-1962.gz: No such file or directory
/pub/data/noaa/1963/722950-23174-1963.gz
ERROR: 550 /pub/data/noaa/1963/722950-23174-1963.gz: No such file or directory
/pub/data/noaa/1964/722950-23174-1964.gz
ERROR: 550 /pub/data/noaa/1964/722950-23174-1964.gz: No such file or directory
/pub/data/noaa/1965/722950-23174-1965.gz
ERROR: 550 /pub/data/noaa/1965/722950-23174-1965.gz: No such file or directory
/pub/data/noaa/1966/722950-23174-1966.gz
ERROR: 550 /pub/data/noaa/1966/722950-23174-1966.gz: No such file or directory
/pub/data/noaa/1967/722950-23174-1967.gz
ERROR: 550 /pub/data/noaa/1967/722950-23174-1967.gz: No such file or directory
/pub/data/noaa/1968/722950-23174-1968.gz
ERROR: 550 /pub/data/noaa/1968/722950-23174-1968.gz: No such file or directory
/pub/data/noaa/1969/722950-23174-1969.gz
ERROR: 550 /pub/data/noaa/1969/722950-23174-1969.gz: No such file or directory
/pub/data/noaa/1970/722950-23174-1970.gz
ERROR: 550 /pub/data/noaa/1970/722950-23174-1970.gz: No such file or directory
/pub/data/noaa/1971/722950-23174-1971.gz
ERROR: 550 /pub/data/noaa/1971/722950-23174-1971.gz: No such file or directory
/pub/data/noaa/1972/722950-23174-1972.gz
ERROR: 550 /pub/data/noaa/1972/722950-23174-1972.gz: No such file or directory
/pub/data/noaa/1973/722950-23174-1973.gz
/pub/data/noaa/1974/722950-23174-1974.gz
/pub/data/noaa/1975/722950-23174-1975.gz
/pub/data/noaa/1976/722950-23174-1976.gz
/pub/data/noaa/1977/722950-23174-1977.gz
/pub/data/noaa/1978/722950-23174-1978.gz
/pub/data/noaa/1979/722950-23174-1979.gz
/pub/data/noaa/1980/722950-23174-1980.gz
/pub/data/noaa/1981/722950-23174-1981.gz
/pub/data/noaa/1982/722950-23174-1982.gz
/pub/data/noaa/1983/722950-23174-1983.gz
/pub/data/noaa/1984/722950-23174-1984.gz
/pub/data/noaa/1985/722950-23174-1985.gz
/pub/data/noaa/1986/722950-23174-1986.gz
/pub/data/noaa/1987/722950-23174-1987.gz
/pub/data/noaa/1988/722950-23174-1988.gz
/pub/data/noaa/1989/722950-23174-1989.gz
/pub/data/noaa/1990/722950-23174-1990.gz
/pub/data/noaa/1991/722950-23174-1991.gz
/pub/data/noaa/1992/722950-23174-1992.gz
/pub/data/noaa/1993/722950-23174-1993.gz
/pub/data/noaa/1994/722950-23174-1994.gz
/pub/data/noaa/1995/722950-23174-1995.gz
/pub/data/noaa/1996/722950-23174-1996.gz
/pub/data/noaa/1997/722950-23174-1997.gz
/pub/data/noaa/1998/722950-23174-1998.gz
/pub/data/noaa/1999/722950-23174-1999.gz
/pub/data/noaa/2000/722950-23174-2000.gz
/pub/data/noaa/2001/722950-23174-2001.gz
/pub/data/noaa/2002/722950-23174-2002.gz
/pub/data/noaa/2003/722950-23174-2003.gz
/pub/data/noaa/2004/722950-23174-2004.gz
WARNING:root:unable to load report, error: Non matching lengths. Expected 1407, got 1346
WARNING:root:unable to load report, error: Non matching lengths. Expected 1409, got 1348
WARNING:root:unable to load report, error: Non matching lengths. Expected 1425, got 1348
WARNING:root:unable to load report, error: Non matching lengths. Expected 1423, got 1346
WARNING:root:unable to load report, error: Non matching lengths. Expected 1423, got 1346
WARNING:root:unable to load report, error: Non matching lengths. Expected 1351, got 1290
WARNING:root:unable to load report, error: Non matching lengths. Expected 1425, got 1348
WARNING:root:unable to load report, error: Non matching lengths. Expected 1407, got 1346
/pub/data/noaa/2005/722950-23174-2005.gz
/pub/data/noaa/2006/722950-23174-2006.gz
/pub/data/noaa/2007/722950-23174-2007.gz
/pub/data/noaa/2008/722950-23174-2008.gz
/pub/data/noaa/2009/722950-23174-2009.gz
/pub/data/noaa/2010/722950-23174-2010.gz
/pub/data/noaa/2011/722950-23174-2011.gz
/pub/data/noaa/2012/722950-23174-2012.gz
/pub/data/noaa/2013/722950-23174-2013.gz
/pub/data/noaa/2014/722950-23174-2014.gz
/pub/data/noaa/2015/722950-23174-2015.gz
/pub/data/noaa/2016/722950-23174-2016.gz
/pub/data/noaa/2017/722950-23174-2017.gz
552567 records
Saving station data to: '../data/temperatures/ISD/722950-23174-AT.h5'
#### 999999 23174 range(1947, 1973)
/pub/data/noaa/1947/999999-23174-1947.gz
/pub/data/noaa/1948/999999-23174-1948.gz
/pub/data/noaa/1949/999999-23174-1949.gz
/pub/data/noaa/1950/999999-23174-1950.gz
/pub/data/noaa/1951/999999-23174-1951.gz
/pub/data/noaa/1952/999999-23174-1952.gz
/pub/data/noaa/1953/999999-23174-1953.gz
/pub/data/noaa/1954/999999-23174-1954.gz
/pub/data/noaa/1955/999999-23174-1955.gz
/pub/data/noaa/1956/999999-23174-1956.gz
/pub/data/noaa/1957/999999-23174-1957.gz
/pub/data/noaa/1958/999999-23174-1958.gz
/pub/data/noaa/1959/999999-23174-1959.gz
/pub/data/noaa/1960/999999-23174-1960.gz
/pub/data/noaa/1961/999999-23174-1961.gz
/pub/data/noaa/1962/999999-23174-1962.gz
/pub/data/noaa/1963/999999-23174-1963.gz
/pub/data/noaa/1964/999999-23174-1964.gz
/pub/data/noaa/1965/999999-23174-1965.gz
/pub/data/noaa/1966/999999-23174-1966.gz
/pub/data/noaa/1967/999999-23174-1967.gz
/pub/data/noaa/1968/999999-23174-1968.gz
/pub/data/noaa/1969/999999-23174-1969.gz
/pub/data/noaa/1970/999999-23174-1970.gz
/pub/data/noaa/1971/999999-23174-1971.gz
/pub/data/noaa/1972/999999-23174-1972.gz
227900 records
Saving station data to: '../data/temperatures/ISD/999999-23174-AT.h5'
Saving combined data to: 'KLAX_AT.h5'
CPU times: user 1min 40s, sys: 3 s, total: 1min 43s
Wall time: 6min 16s

In [ ]:


In [20]:
# # Plot (uncomment to enable)

# if SUPPRESS_FIGURE_DISPLAY:
#     plt.ioff()
# ax = df.plot(title=STATION_CALLSIGN, marker='.')
# ax.set_ylabel('air temperature [$\degree$ C]')
# plt.savefig(os.path.join(OUTDIR,'{}_AT_orig.png'.format(STATION_CALLSIGN)))
# plt.ion()

In [ ]:


In [ ]:
## Distribution plot (uncomment to enable)

# fig = plt.figure()
# ax = fig.add_subplot(1,1,1)
# sns.kdeplot(df['AT'], bw=.5, ax=ax, legend=False)
# ax.set_xlabel('air temperature [$\degree$C]')
# ax.set_ylabel('proportion of readings')

In [ ]:


In [ ]:


In [ ]: